{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import json\n",
    "import re\n",
    "import ast\n",
    "import copy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataset\n",
    "humaneval_data_files = [e for e in os.listdir('humaneval_data/') if e.endswith('.py')]\n",
    "mbpp_data_files = [e for e in os.listdir('mbpp_data/') if e.endswith('.py')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# error coverage\n",
    "coverage_error_note = open(\"coverage_note.txt\").readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pynguin gen tests .replace('test_','')\n",
    "humaneval_test_3_files = [e for e in os.listdir('humaneval_test_3/') if 'test_problem_id_' in e]\n",
    "humaneval_test_5_files = [e for e in os.listdir('humaneval_test_5/') if 'test_problem_id_' in e]\n",
    "humaneval_test_perfect_files = [e for e in os.listdir('humaneval_test_perfect/') if 'test_problem_id_' in e and e.endswith('.py')]\n",
    "\n",
    "mbpp_test_3_files = [e for e in os.listdir('mbpp_test_3/') if 'test_problem_id_' in e]\n",
    "mbpp_test_5_files = [e for e in os.listdir('mbpp_test_5/') if 'test_problem_id_' in e]\n",
    "mbpp_test_perfect_files = [e for e in os.listdir('mbpp_test_perfect/') if 'test_problem_id_' in e and e.endswith('.py')]\n",
    "\n",
    "test_path = ['3', '5','perfect']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# coverage report\n",
    "coverage_humaneval_3 = json.load(open('coverage_humaneval_3.json'))\n",
    "coverage_humaneval_5 = json.load(open('coverage_humaneval_5.json'))\n",
    "coverage_humaneval_perfect = json.load(open('coverage_humaneval_perfect.json'))\n",
    "\n",
    "coverage_mbpp_3 = json.load(open('coverage_mbpp_3.json'))\n",
    "coverage_mbpp_5 = json.load(open('coverage_mbpp_5.json'))\n",
    "coverage_mbpp_perfect = json.load(open('coverage_mbpp_perfect.json'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# mutation report\n",
    "mutation_humaneval_3 = open('humaneval_test_3/mutation_humaneval_3.txt').readlines()\n",
    "mutation_humaneval_5 = open('humaneval_test_5/mutation_humaneval_5.txt').readlines()\n",
    "mutation_humaneval_perfect = open('humaneval_test_perfect/mutation_humaneval_perfect.txt').readlines()\n",
    "\n",
    "mutation_mbpp_3 = open('mbpp_test_3/mutation_mbpp_3.txt').readlines()\n",
    "mutation_mbpp_5 = open('mbpp_test_5/mutation_mbpp_5.txt').readlines()\n",
    "mutation_mbpp_perfect = open('mbpp_test_perfect/mutation_mbpp_perfect.txt').readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "humaneval_test_list = [humaneval_test_3_files, humaneval_test_5_files, humaneval_test_perfect_files]\n",
    "mbpp_test_list = [mbpp_test_3_files, mbpp_test_5_files, mbpp_test_perfect_files]\n",
    "\n",
    "humaneval_coverage_list = [coverage_humaneval_3, coverage_humaneval_5, coverage_humaneval_perfect]\n",
    "mbpp_coverage_list = [coverage_mbpp_3, coverage_mbpp_5, coverage_mbpp_perfect]\n",
    "\n",
    "humaneval_mutation_list = [mutation_humaneval_3, mutation_humaneval_5, mutation_humaneval_perfect]\n",
    "mbpp_mutation_list = [mutation_mbpp_3, mutation_mbpp_5, mutation_mbpp_perfect]\n",
    "\n",
    "test_type = ['3-test', '5-test','multi perfect']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_generate(data_files, test_files):\n",
    "    temp_test_files = [e.replace('test_','') for e in test_files]\n",
    "    return round(len(set(data_files).intersection(set(temp_test_files)))/len(data_files) * 100, 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HumanEval Dataset\n",
      "3-test 87.8 (144/164)\n",
      "5-test 85.37 (140/164)\n",
      "multi perfect 86.59 (142/164)\n",
      "\n",
      "MBPP Dataset\n",
      "3-test 91.44 (235/257)\n",
      "5-test 90.66 (233/257)\n",
      "multi perfect 89.49 (230/257)\n"
     ]
    }
   ],
   "source": [
    "print('HumanEval Dataset')\n",
    "for i, test_files in enumerate(humaneval_test_list):\n",
    "    print(test_type[i], check_generate(humaneval_data_files, test_files), f'({len(test_files)}/{len(humaneval_data_files)})')\n",
    "print()\n",
    "print('MBPP Dataset')\n",
    "for i, test_files in enumerate(mbpp_test_list):\n",
    "    print(test_type[i], check_generate(mbpp_data_files, test_files), f'({len(test_files)}/{len(mbpp_data_files)})')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Compilable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_compilable(test_files, rootpath, data_files):\n",
    "    n_parsed = 0\n",
    "    for file in test_files:\n",
    "        temp_text = open(rootpath + file).read()\n",
    "        if temp_text == '':\n",
    "            continue\n",
    "        try:\n",
    "            ast.parse(temp_text)\n",
    "            n_parsed += 1\n",
    "        except e:\n",
    "            pass\n",
    "    return round(n_parsed/len(data_files)*100, 2), n_parsed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HumanEval Dataset\n",
      "3-test 87.8 (Pass AST: 144/164)\n",
      "5-test 85.37 (Pass AST: 140/164)\n",
      "multi perfect 86.59 (Pass AST: 142/164)\n",
      "\n",
      "MBPP Dataset\n",
      "3-test 91.44 (Pass AST: 235/257)\n",
      "5-test 90.66 (Pass AST: 233/257)\n",
      "multi perfect 89.49 (Pass AST: 230/257)\n"
     ]
    }
   ],
   "source": [
    "print('HumanEval Dataset')\n",
    "for i, test_files in enumerate(humaneval_test_list):\n",
    "    score, count = check_compilable(test_files, f'humaneval_test_{test_path[i]}/', humaneval_data_files)\n",
    "    print(test_type[i], score, f'(Pass AST: {count}/{len(humaneval_data_files)})')\n",
    "print()\n",
    "print('MBPP Dataset')\n",
    "for i, test_files in enumerate(mbpp_test_list):\n",
    "    score, count = check_compilable(test_files, f'mbpp_test_{test_path[i]}/', mbpp_data_files)\n",
    "    print(test_type[i], score, f'(Pass AST: {count}/{len(mbpp_data_files)})')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Passing Test Case"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Filtered Not Pass Coverage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Files with error test case(s).\n",
      "humaneval_test_perfect\t:\t5\n",
      "humaneval_test_5\t:\t2\n",
      "humaneval_test_3\t:\t3\n",
      "mbpp_test_perfect\t:\t25\n",
      "mbpp_test_5\t:\t27\n",
      "mbpp_test_3\t:\t29\n"
     ]
    }
   ],
   "source": [
    "coverage_error_dict = {}\n",
    "k = \"\"\n",
    "for line in coverage_error_note:\n",
    "    if line.startswith('coverage run -m '):\n",
    "        k = line.split()[-1]\n",
    "        coverage_error_dict[k] = set()\n",
    "    elif line.startswith('FAILED '):\n",
    "        temp = line.split()[1]\n",
    "        temp = temp.split(\"::\")[0]\n",
    "        coverage_error_dict[k].add(temp)\n",
    "print('Files with error test case(s).')\n",
    "for k in coverage_error_dict:\n",
    "    coverage_error_dict[k] = list(coverage_error_dict[k])\n",
    "    print(f\"{k}\\t:\\t{len(coverage_error_dict[k])}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['humaneval_test_3/test_problem_id_22.py',\n",
       " 'humaneval_test_3/test_problem_id_16.py',\n",
       " 'humaneval_test_3/test_problem_id_29.py']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "coverage_error_dict['humaneval_test_3']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def filter_coverage_error_files(coverage_error_list, coverage_report):\n",
    "    filtered_coverage_report = copy.deepcopy(coverage_report)\n",
    "    for filename in coverage_error_list:\n",
    "        if filename in filtered_coverage_report['files']:\n",
    "            data_filename = filename.replace('/test_', '/')\n",
    "            del filtered_coverage_report['files'][data_filename]\n",
    "            del filtered_coverage_report['files'][filename]\n",
    "            # print('deleted')\n",
    "    return filtered_coverage_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "filtered_coverage_humaneval_3 = filter_coverage_error_files(coverage_error_dict['humaneval_test_3'], coverage_humaneval_3)\n",
    "filtered_coverage_humaneval_5 = filter_coverage_error_files(coverage_error_dict['humaneval_test_5'], coverage_humaneval_5)\n",
    "filtered_coverage_humaneval_perfect = filter_coverage_error_files(coverage_error_dict['humaneval_test_perfect'], coverage_humaneval_perfect)\n",
    "\n",
    "filtered_coverage_mbpp_3 = filter_coverage_error_files(coverage_error_dict['mbpp_test_3'], coverage_mbpp_3)\n",
    "filtered_coverage_mbpp_5 = filter_coverage_error_files(coverage_error_dict['mbpp_test_5'], coverage_mbpp_5)\n",
    "filtered_coverage_mbpp_perfect = filter_coverage_error_files(coverage_error_dict['mbpp_test_perfect'], coverage_mbpp_perfect)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "humaneval_filtered_coverage_list = [filtered_coverage_humaneval_3, filtered_coverage_humaneval_5, filtered_coverage_humaneval_perfect]\n",
    "mbpp_filtered_coverage_list = [filtered_coverage_mbpp_3, filtered_coverage_mbpp_5, filtered_coverage_mbpp_perfect]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Filter Not Pass Mutation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    " # Mutation (MutPy) won't run when any test cases fail, so no need to filter."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Coverage Score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_coverage(coverage_report, data_files):\n",
    "    filter_files = [f for f in list(coverage_report['files']) if 'test_problem_id_' not in f and f.endswith('.py')]\n",
    "    count = len(filter_files)\n",
    "    coverage_score = []\n",
    "    for file in filter_files:\n",
    "        coverage_score.append(coverage_report['files'][file]['summary']['percent_covered'])\n",
    "    # print(coverage_score)\n",
    "    return round(sum(coverage_score) / len(data_files), 2), count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HumanEval Dataset\n",
      "3-test 87.04 (Available: 144/164=87.8)\n",
      "5-test 84.65 (Available: 140/164=85.37)\n",
      "multi perfect 85.91 (Available: 142/164=86.59)\n",
      "\n",
      "MBPP Dataset\n",
      "3-test 87.03 (Available: 235/257=91.44)\n",
      "5-test 86.79 (Available: 233/257=90.66)\n",
      "multi perfect 84.64 (Available: 230/257=89.49)\n"
     ]
    }
   ],
   "source": [
    "print('HumanEval Dataset')\n",
    "for i, test_files in enumerate(humaneval_coverage_list):\n",
    "    score, count = calculate_coverage(test_files, humaneval_data_files)\n",
    "    print(test_type[i], score, f'(Available: {count}/{len(humaneval_data_files)}={round(count/len(humaneval_data_files)*100,2)})')\n",
    "print()\n",
    "print('MBPP Dataset')\n",
    "for i, test_files in enumerate(mbpp_coverage_list):\n",
    "    score, count = calculate_coverage(test_files, mbpp_data_files)\n",
    "    print(test_type[i], score, f'(Available: {count}/{len(mbpp_data_files)}={round(count/len(mbpp_data_files)*100,2)})')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HumanEval Dataset\n",
      "3-test 85.21 (Available: 141/164=85.98)\n",
      "5-test 83.43 (Available: 138/164=84.15)\n",
      "multi perfect 82.87 (Available: 137/164=83.54)\n",
      "\n",
      "MBPP Dataset\n",
      "3-test 76.67 (Available: 206/257=80.16)\n",
      "5-test 77.0 (Available: 206/257=80.16)\n",
      "multi perfect 75.86 (Available: 205/257=79.77)\n"
     ]
    }
   ],
   "source": [
    "print('HumanEval Dataset')\n",
    "for i, test_files in enumerate(humaneval_filtered_coverage_list):\n",
    "    score, count = calculate_coverage(test_files, humaneval_data_files)\n",
    "    print(test_type[i], score, f'(Available: {count}/{len(humaneval_data_files)}={round(count/len(humaneval_data_files)*100,2)})')\n",
    "print()\n",
    "print('MBPP Dataset')\n",
    "for i, test_files in enumerate(mbpp_filtered_coverage_list):\n",
    "    score, count = calculate_coverage(test_files, mbpp_data_files)\n",
    "    print(test_type[i], score, f'(Available: {count}/{len(mbpp_data_files)}={round(count/len(mbpp_data_files)*100,2)})')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Mutation Score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_mutation(mutation_report, data_files):\n",
    "    regex_pattern = r'[\\n\\r]*Mutation score.*: [ \\t]*([^\\n\\r]*)' \n",
    "    mutation_score = []\n",
    "    count = 0\n",
    "    for line in mutation_report:\n",
    "        if '[*] Mutation score' in line:\n",
    "            mutation_score.append(float(re.findall(regex_pattern, line)[0][:-1]))\n",
    "            count += 1\n",
    "    # print(mutation_score)\n",
    "    return round(sum(mutation_score) / len(data_files), 2), count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_mutation_operator(mutation_report):\n",
    "    regex_pattern = r'   - [[\\[#   \\d\\]]+([^\\n]*) problem_id_'\n",
    "    mutation_operator_dict = {}\n",
    "    mutation_report_full = \"\\n\".join(mutation_report)\n",
    "    mutation_operator = re.findall(regex_pattern, mutation_report_full)\n",
    "    for op in mutation_operator:\n",
    "        if op not in mutation_operator_dict:\n",
    "            mutation_operator_dict[op] = 0\n",
    "        mutation_operator_dict[op] += 1\n",
    "    mutation_operator_dict = dict(sorted(mutation_operator_dict.items(), key=lambda item: item[1], reverse=True))\n",
    "    return mutation_operator_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HumanEval Dataset\n",
      "3-test 23.61 (Available: 140/164=85.37)\n",
      "5-test 21.45 (Available: 138/164=84.15)\n",
      "multi perfect 25.59 (Available: 136/164=82.93)\n",
      "\n",
      "MBPP Dataset\n",
      "3-test 19.02 (Available: 206/257=80.16)\n",
      "5-test 21.71 (Available: 206/257=80.16)\n",
      "multi perfect 20.71 (Available: 205/257=79.77)\n"
     ]
    }
   ],
   "source": [
    "print('HumanEval Dataset')\n",
    "for i, test_files in enumerate(humaneval_mutation_list):\n",
    "    score, count = calculate_mutation(test_files, humaneval_data_files)\n",
    "    print(test_type[i], score, f'(Available: {count}/{len(humaneval_data_files)}={round(count/len(humaneval_data_files)*100,2)})')\n",
    "print()\n",
    "print('MBPP Dataset')\n",
    "for i, test_files in enumerate(mbpp_mutation_list):\n",
    "    score, count = calculate_mutation(test_files, mbpp_data_files)\n",
    "    print(test_type[i], score, f'(Available: {count}/{len(mbpp_data_files)}={round(count/len(mbpp_data_files)*100,2)})')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HumanEval Dataset\n",
      "3-test {'AOR': 365, 'ROR': 225, 'COI': 151, 'ASR': 59, 'SIR': 39, 'AOD': 37, 'LCR': 24, 'COD': 12, 'BCR': 2, 'EHD': 1}\n",
      "5-test {'AOR': 355, 'ROR': 224, 'COI': 149, 'ASR': 58, 'SIR': 39, 'AOD': 37, 'LCR': 23, 'COD': 12, 'BCR': 2, 'EHD': 1}\n",
      "multi perfect {'AOR': 348, 'ROR': 211, 'COI': 144, 'ASR': 62, 'SIR': 39, 'AOD': 37, 'LCR': 22, 'COD': 12, 'BCR': 2, 'EHD': 1}\n",
      "\n",
      "MBPP Dataset\n",
      "3-test {'AOR': 611, 'ROR': 233, 'COI': 161, 'ASR': 51, 'SIR': 34, 'LOR': 32, 'AOD': 27, 'LCR': 22, 'BCR': 10, 'COD': 9, 'EHD': 1, 'EXS': 1}\n",
      "5-test {'AOR': 597, 'ROR': 234, 'COI': 161, 'ASR': 51, 'SIR': 34, 'LOR': 32, 'AOD': 27, 'LCR': 22, 'BCR': 10, 'COD': 8, 'EHD': 1, 'EXS': 1}\n",
      "multi perfect {'AOR': 588, 'ROR': 226, 'COI': 155, 'ASR': 48, 'SIR': 34, 'LOR': 32, 'AOD': 27, 'LCR': 21, 'BCR': 10, 'COD': 9, 'EHD': 1, 'EXS': 1}\n"
     ]
    }
   ],
   "source": [
    "print('HumanEval Dataset')\n",
    "for i, test_files in enumerate(humaneval_mutation_list):\n",
    "    mutation_operator = check_mutation_operator(test_files)\n",
    "    print(test_type[i], mutation_operator)\n",
    "print()\n",
    "print('MBPP Dataset')\n",
    "for i, test_files in enumerate(mbpp_mutation_list):\n",
    "    mutation_operator = check_mutation_operator(test_files)\n",
    "    print(test_type[i], mutation_operator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (py310)",
   "language": "python",
   "name": "py310"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
